Chapter 3 Data quality

load("data/data.Rdata")

3.1 Load statistics

read_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/D9oEqxuxql/reports/by_step/reads_data/multiqc_general_stats.txt",
                         "https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/reads_data/multiqc_general_stats.txt")) %>%
   mutate(Sample = str_extract(Sample, "M\\d+")) %>%
  rename_with(~str_remove(., "FastQC_mqc-generalstats-fastqc-"), starts_with("FastQC_mqc-generalstats-fastqc-")) %>%
  rename(microsample=Sample) %>%
  select(microsample,percent_duplicates,total_sequences, percent_gc) %>%
  group_by(microsample) %>%
  summarise(total_sequences=sum(total_sequences), percent_duplicates=mean(percent_duplicates), percent_gc=mean(percent_gc))
host_mapping_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/b9ZbTPY0kQ/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt",
                                 "https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt")) %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="chicken") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample,reads_mapped_host=mapped_passed,reads_mapped_host_percent=mapped_passed_pct) %>%
    select(microsample,reads_mapped_host,reads_mapped_host_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_host=sum(reads_mapped_host),reads_mapped_host_percent=mean(reads_mapped_host_percent))
human_mapping_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/b9ZbTPY0kQ/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt",
                                 "https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt")) %>%
    mutate(reference = case_when(
        grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
        grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
        TRUE ~ NA )) %>%
    filter(reference=="human") %>%
    mutate(Sample = str_extract(Sample, "M\\d+")) %>%
    rename(microsample=Sample, reads_mapped_human=mapped_passed,reads_mapped_human_percent=mapped_passed_pct) %>%
    select(microsample,reads_mapped_human,reads_mapped_human_percent) %>%
    group_by(microsample) %>%
    summarise(reads_mapped_human=sum(reads_mapped_human),reads_mapped_human_percent=mean(reads_mapped_human_percent))
quantification_stats  <- read_tsv(c(
  "https://sid.erda.dk/share_redirect/D9oEqxuxql/reports/by_step/quantify_data/multiqc_general_stats.txt",
   "https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/quantify_data/multiqc_general_stats.txt")) %>%
   filter(str_detect(Sample, "mgg-pbdrep")) %>%
   mutate(Sample = str_extract(Sample, "M\\d+")) %>%
   rename_with(~str_remove(., "Samtools: stats_mqc-generalstats-samtools_stats-"), 
               starts_with("Samtools: stats_mqc-generalstats-samtools_stats-")) %>%
  rename_with(~str_remove(., "Samtools: flagstat_mqc-generalstats-samtools_flagstat-"),
              starts_with("Samtools: flagstat_mqc-generalstats-samtools_flagstat-")) %>%
   rename(microsample=Sample) %>%
  group_by(microsample) %>%
  summarise(reads_mapped=sum(reads_mapped),reads_mapped_percent=mean(reads_mapped_percent))
quality_stats <- read_stats %>%
    left_join(host_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(human_mapping_stats, by=join_by(microsample==microsample)) %>%
    left_join(quantification_stats, by=join_by(microsample==microsample))

3.2 Individual overview

3.2.1 Sequencing depth

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=total_sequences,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=10000000, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Number of reads", y="Microsamples", fill="Library protocol")

3.2.2 Sequence duplication

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=percent_duplicates,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=65, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of duplicates", y="Microsamples", fill="Library protocol")

3.2.3 GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=percent_gc,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=60, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Percentage of GC", y="Microsamples", fill="Library protocol")

3.2.4 Host %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_host_percent,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=55, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Host %", y="Microsamples", fill="Library protocol")

3.2.5 Human %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_human_percent,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=55, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Human %", y="Microsamples", fill="Library protocol")

3.2.6 Bacteria mapping %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=reads_mapped_percent,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=60, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Mapped to MAGs (%)", y="Microsamples", fill="Library protocol")

3.3 Biplots

3.3.1 Sequencing depth vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    ggplot(aes(x=percent_gc,y=total_sequences,color=protocol))+
      geom_point()+
      scale_color_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      facet_nested(. ~ batch, scales="free") +
      labs(color="Library protocol")

3.3.2 Duplicates vs. GC %

quality_stats %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    ggplot(aes(x=percent_gc,y=percent_duplicates,color=protocol))+
      geom_point()+
      scale_color_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      facet_nested(. ~ batch, scales="free")+
      labs(color="Library protocol")

3.4 Quality flagging

quality <- quality_stats %>%
    mutate(depth = case_when(
        total_sequences <= 10000000 ~ 0,
        total_sequences > 10000000 ~ 1,
        TRUE ~ NA)) %>%
    mutate(duplicates = case_when(
        percent_duplicates >= 65 ~ 0,
        percent_duplicates < 65 ~ 1,
        TRUE ~ NA)) %>%
    mutate(gc = case_when(
        percent_gc >= 60 ~ 0,
        percent_gc < 60 ~ 1,
        TRUE ~ NA)) %>%
    mutate(human = case_when(
        reads_mapped_human_percent >= 5 ~ 0,
        reads_mapped_human_percent < 5 ~ 1,
        TRUE ~ NA)) %>%
    mutate(bacteria = case_when(
        reads_mapped_percent <= 60 ~ 0,
        reads_mapped_percent > 60 ~ 1,
        TRUE ~ NA)) %>%
    select(microsample, depth, duplicates, gc, human, bacteria) %>%
    mutate(quality = depth + duplicates + gc + human + bacteria) %>%
    select(microsample, quality)

quality %>% write_tsv("results/quality.tsv")

3.4.1 Quality overview

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    ggplot(aes(x=quality,y=microsample,fill=protocol))+
      geom_col()+
      scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
      geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) + 
      facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
      theme(strip.text.y.left = element_text(angle = 0)) +
      labs(x="Quality score", y="Microsamples", fill="Library protocol")

quality %>%
    left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
    filter(section != "Ileum") %>%
    filter(type == "Positive") %>%
    group_by(protocol) %>%
    summarise(average=mean(quality, na.rm=TRUE), percentage_5 = mean(quality == 5, na.rm = TRUE) * 100) %>%
    tt()
tinytable_dwu9z6emugamw71x43tw
protocol average percentage_5
full_celero 4.071429 67.85714
full_ulv2 4.583333 77.77778
half_ulv2 4.500000 85.71429